#!/usr/bin/env python3
# _*_ coding:utf-8 _*_
'''
    练习爬虫：获取图片 http://www.win4000.com/meitu.html
    作者：王楠
    日期：2020/03/05
    功能：
    1.0：获取图片url存到list    request   bs4
    2.0：下载图片到本地    os  tkinter
    3.0：增加所有页面 http://www.win4000.com/meinvtag4_1.html   meinv_tag1 2 3 4
    4.0: 修修补补，增加选择tag的选项，接受用户输入tag编号
'''
import requests as re
from bs4 import BeautifulSoup as bs
import tkinter as tk
from tkinter.filedialog import askdirectory
import urllib.request
import os


def get_html_soup(url):
    '''
    Fetch *url* (30 s timeout) and return the parsed BeautifulSoup document.
    '''
    # NOTE: `re` is the requests module (aliased at file top), not stdlib re.
    response = re.get(url, timeout=30)
    return bs(response.text, 'html.parser')


def get_url_dict(tag_name, tag_url):
    '''
    Collect the page URL of every photo album listed under one tag.

    Walks the 5 listing pages derived from *tag_url* (which ends in
    "_1.html"; the trailing digit is swapped per page) and returns a
    dict mapping album title -> album page URL.
    '''
    album_count = 0          # running total of albums seen
    album_urls = dict()      # album title -> album page URL
    print('开始获取《' + tag_name + '》下所有图集地址：')
    for page in range(1, 6):  # the site shows 5 listing pages per tag
        page_url = tag_url[:-6] + str(page) + '.html'
        soup = get_html_soup(page_url)
        left_bar = soup.find('div', class_='Left_bar')
        for item in left_bar.find_all('li'):
            title = item.find('img')['title']
            href = item.find('a')['href']
            album_urls[title] = href
            album_count += 1
            # Light progress feedback every 5 albums.
            if album_count % 5 == 0:
                print('正在获取第{}个图集...'.format(album_count))
    print('《' + tag_name + '》所有图集地址获取完毕，共获取到了{}个图集地址。'.format(album_count))
    return album_urls


def get_image_url(name, url):
    '''
    Return the list of image URLs contained in one album.

    The album's first page carries the total image count in an <em>
    tag; the remaining pages follow the pattern <base>_<n>.html.
    '''
    soup = get_html_soup(url)
    total = int(soup.find('em').text)  # number of images in this album
    first_pic = soup.find('img', class_='pic-large')
    image_urls = [first_pic['data-original']]  # page 1's image
    for idx in range(2, total + 1):
        page_url = url[:-5] + '_' + str(idx) + '.html'
        pic = get_html_soup(page_url).find('img', class_='pic-large')
        image_urls.append(pic['data-original'])
    return image_urls


def down_save_image(tag_name, filepath, all_images_dict):
    '''
    Download every image of every album in *all_images_dict*.

    Parameters:
        tag_name: tag title, used as a sub-directory name.
        filepath: root directory chosen by the user.
        all_images_dict: dict mapping album name -> list of image URLs.

    Images are saved as <filepath>/<tag_name>/<album>/<n>.jpg.
    '''
    def callbackfunc(blocknum, blocksize, totalsize):
        '''
        urlretrieve progress hook.
            blocknum: blocks transferred so far
            blocksize: block size in bytes
            totalsize: remote file size (<= 0 when the server omits it)
        '''
        if totalsize <= 0:
            # urlretrieve passes -1 when Content-Length is unknown;
            # avoid a bogus (or zero-division) percentage.
            return
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent > 100:
            percent = 100
        # Bug fix: this print was nested inside the clamp above, so
        # progress was only ever reported once the transfer overshot 100%.
        if percent % 10 == 0 and percent <= 100:
            print('图片已经下载{}%'.format(percent))

    images_num = 0  # total images downloaded across all albums
    num = 1         # 1-based index of the current album
    for name, images_url_list in all_images_dict.items():
        album_dir = os.path.join(filepath, tag_name, name)
        if not os.path.exists(album_dir):
            os.makedirs(album_dir)
        count = 1  # 1-based index of the image within the album
        for image_url in images_url_list:
            images_num += 1
            print('正在下载第{}个图片集：<'.format(num) + name +
                  '>的第{}张图片'.format(count))
            result = urllib.request.urlretrieve(
                image_url,
                filename=os.path.join(album_dir, str(count) + '.jpg'),
                reporthook=callbackfunc,
                data=None)
            print('第{}个图片集：<'.format(num) + name +
                  '>的第{}张图片以保存在：'.format(count) + result[0])
            count += 1
        num += 1
    print('××××××××××××××××××××××××××××××××××××')
    print('下载完成,共下载{}张图。'.format(images_num))


def get_filepath(tag_name):
    '''
    Ask the user for a save directory via a folder-picker dialog,
    re-prompting until a non-empty path is chosen.
    '''
    print('选择文件保存位置')
    root = tk.Tk()
    root.withdraw()  # hide the empty Tk main window; show only the dialog
    chosen = askdirectory(title=u'选择tag：《' + tag_name + '》的保存路径')
    while not chosen:  # dialog returns '' when cancelled
        print('重新选择tag：《' + tag_name + '》的保存路径')
        chosen = askdirectory(title=u'选择tag：《' + tag_name + '》的保存路径')
    return chosen


def get_all_tag():
    '''
    Probe meinvtag0..meinvtag59 on win4000.com and record every id
    whose first listing page exists.

    Returns:
        tag_dict: dict mapping tag title -> first listing-page URL
        key: dict mapping numeric tag id (as str) -> tag title
    '''
    key = dict()       # '7' -> title
    tag_dict = dict()  # title -> listing URL
    for idx in range(60):
        page_url = 'http://www.win4000.com/meinvtag' + str(idx) + '_1.html'
        resp = re.get(page_url, timeout=30)
        if resp.status_code != 200:
            continue  # this tag id does not exist
        title = bs(resp.text, 'html.parser').find('h2').text
        key[str(idx)] = title
        tag_dict[title] = 'http://www.win4000.com/meinvtag' + str(
            idx) + '_1.html'
        print('meinvtag编号{}是：'.format(idx), title)
    return tag_dict, key


def main():
    '''
    Entry point: list all available tags, let the user download either
    all of them ('a') or a space-separated selection of ids ('s'), then
    collect every album's image URLs and save the images to a directory
    the user picks per tag.
    '''
    all_tag_dict, tag_num = get_all_tag()  # title -> url, id(str) -> title
    tag_count = 0  # tags actually processed
    image_num = 0  # images downloaded overall
    all_tag_num = len(all_tag_dict)
    print('共计{}个tag\n************************************'.format(all_tag_num))
    a_or_s = input('a:下载所有。s:指定tag进行下载。(a/s):')
    down_tag_dict = dict()
    if a_or_s == 'a':
        down_tag_dict = all_tag_dict
    elif a_or_s == 's':
        tag_num_input_str = input('请输入需要下载的tag序号（多个用空格隔开）:')
        tag_num_input = tag_num_input_str.split(' ')
        for i in tag_num_input:
            # Direct dict lookup; tag_num keys are the ids as strings.
            # Unknown ids are silently skipped, as before.
            title = tag_num.get(i)
            if title is not None:
                down_tag_dict[title] = all_tag_dict[title]
        print('您制定的tag为：')
        for i in down_tag_dict:
            print(i)
    print('****************************\n开始下载')
    # Bug fix: filepath was only bound inside the loop, so an empty
    # selection (bad a/s answer or no matching ids) crashed with
    # NameError at the final print. Initialize it up front.
    filepath = ''
    for tag_name, tag_url in down_tag_dict.items():
        tag_count += 1
        # (typo fix in message: 'tga' -> 'tag')
        print('总计{}个tag，正在对第{}个tag<{}>进行检索和下载。'.format(len(down_tag_dict), tag_count,
                                                       tag_name))
        url_dict = get_url_dict(tag_name, tag_url)  # all album URLs of this tag
        all_images_dict = dict()  # album name -> list of image URLs
        count = 0  # albums resolved so far
        nums = 0   # images found under this tag
        for name, url in url_dict.items():
            count += 1
            print('开始获取《' + tag_name + '》的第{}个图集的图片的地址......'.format(count))
            image_all_list = get_image_url(name, url)
            all_images_dict[name] = image_all_list
            print('《' + tag_name + '》共{}个图集，第{}个图集的图片地址获取完成，该图集共有{}张图'.format(
                len(url_dict), count, len(image_all_list)))
            nums += len(image_all_list)
        print('《' + tag_name + '》的{}个图集的所有图集地址获取完毕，共图片{}张'.format(count, nums))
        image_num += nums
        filepath = get_filepath(tag_name)
        down_save_image(tag_name, filepath, all_images_dict)
    print('共进行了{}个tag，共计{}张图'.format(tag_count, image_num))
    if filepath:  # only meaningful when at least one tag was downloaded
        print('存放在:' + filepath + '/')


if __name__ == '__main__':
    main()
